# Point the session at the project folder; the relative file paths used by
# this script are resolved against it.
# NOTE(review): hard-coded, machine-specific absolute path.
setwd("D:/BINUS/Semester4/DataMiningVisualization/FinalProject")

# Load the raw crime records and preview the first six rows.
df <- read.csv(file = "Crime_Data_from_2020_to_Present.csv")
head(df, n = 6L)
##       DR_NO              Date.Rptd               DATE.OCC TIME.OCC AREA
## 1 190326475 03/01/2020 12:00:00 AM 03/01/2020 12:00:00 AM     2130    7
## 2 200106753 02/09/2020 12:00:00 AM 02/08/2020 12:00:00 AM     1800    1
## 3 200320258 11/11/2020 12:00:00 AM 11/04/2020 12:00:00 AM     1700    3
## 4 200907217 05/10/2023 12:00:00 AM 03/10/2020 12:00:00 AM     2037    9
## 5 220614831 08/18/2022 12:00:00 AM 08/17/2020 12:00:00 AM     1200    6
## 6 231808869 04/04/2023 12:00:00 AM 12/01/2020 12:00:00 AM     2300   18
##   AREA.NAME Rpt.Dist.No Part.1.2 Crm.Cd
## 1  Wilshire         784        1    510
## 2   Central         182        1    330
## 3 Southwest         356        1    480
## 4  Van Nuys         964        1    343
## 5 Hollywood         666        2    354
## 6 Southeast        1826        2    354
##                                Crm.Cd.Desc             Mocodes Vict.Age
## 1                         VEHICLE - STOLEN                            0
## 2                    BURGLARY FROM VEHICLE      1822 1402 0344       47
## 3                            BIKE - STOLEN           0344 1251       19
## 4 SHOPLIFTING-GRAND THEFT ($950.01 & OVER)           0325 1501       19
## 5                        THEFT OF IDENTITY 1822 1501 0930 2004       28
## 6                        THEFT OF IDENTITY 1822 0100 0930 0929       41
##   Vict.Sex Vict.Descent Premis.Cd                                  Premis.Desc
## 1        M            O       101                                       STREET
## 2        M            O       128            BUS STOP/LAYOVER (ALSO QUERY 124)
## 3        X            X       502 MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)
## 4        M            O       405                               CLOTHING STORE
## 5        M            H       102                                     SIDEWALK
## 6        M            H       501                       SINGLE FAMILY DWELLING
##   Weapon.Used.Cd Weapon.Desc Status  Status.Desc Crm.Cd.1 Crm.Cd.2 Crm.Cd.3
## 1             NA                 AA Adult Arrest      510      998       NA
## 2             NA                 IC  Invest Cont      330      998       NA
## 3             NA                 IC  Invest Cont      480       NA       NA
## 4             NA                 IC  Invest Cont      343       NA       NA
## 5             NA                 IC  Invest Cont      354       NA       NA
## 6             NA                 IC  Invest Cont      354       NA       NA
##   Crm.Cd.4                                 LOCATION Cross.Street     LAT
## 1       NA  1900 S  LONGWOOD                     AV              34.0375
## 2       NA  1000 S  FLOWER                       ST              34.0444
## 3       NA  1400 W  37TH                         ST              34.0210
## 4       NA 14000    RIVERSIDE                    DR              34.1576
## 5       NA                        1900    TRANSIENT              34.0944
## 6       NA  9900    COMPTON                      AV              33.9467
##         LON
## 1 -118.3506
## 2 -118.2628
## 3 -118.3002
## 4 -118.4387
## 5 -118.3277
## 6 -118.2463
# Discard the secondary crime-code columns and the cross-street column.
df <- df[, setdiff(
  names(df),
  c("Crm.Cd.2", "Crm.Cd.3", "Crm.Cd.4", "Cross.Street")
)]

# Keep only rows where both the weapon code and the weapon description are
# present (not NA).
df <- df[!is.na(df$Weapon.Used.Cd) & !is.na(df$Weapon.Desc), ]

# Report the remaining number of rows and columns.
dim(df)
## [1] 324477     24
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
# Return the most frequent value of `v`.
#
# v: a vector (any atomic type).
# Returns a length-one vector of the same type as `v`. When several values
# share the highest frequency, the one that appears first in `v` wins.
get_mode <- function(v) {
  candidates <- unique(v)
  # Map every element to the index of its distinct value, then count how
  # often each index occurs.
  idx <- match(v, candidates)
  freq <- tabulate(idx)
  winner <- which.max(freq)
  candidates[winner]
}
# Store the occurrence time as numeric so arithmetic on it is well defined.
df$TIME.OCC <- as.numeric(df$TIME.OCC)

# Hour of day: integer-divide the time value by 100 and wrap into 0-23.
# NOTE(review): assumes TIME.OCC is HHMM-style (e.g. 2130) - confirm.
df[["Time.h"]] <- (df$TIME.OCC %/% 100) %% 24

# Coordinates for the map; rows whose latitude is exactly 0 are left out.
loc <- df[df$LAT != 0, c("LAT", "LON")]

# Boundary geometry read from the GeoJSON file.
gjson <- "./los-angeles-ca_.geojson"
border <- st_read(gjson)
## Reading layer `los-angeles-ca_' from data source 
##   `D:\BINUS\Semester4\DataMiningVisualization\FinalProject\los-angeles-ca_.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 114 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -118.6682 ymin: 33.70454 xmax: -118.1553 ymax: 34.33731
## Geodetic CRS:  WGS 84
# Map of crime locations: boundary polygons as the base layer, one faint red
# point per record, then the boundary outlines again on top.
ggplot() +
  geom_sf(data = border) +
  geom_point(
    data = loc, aes(x = LON, y = LAT),
    size = 0.1, alpha = 0.025, color = "red"
  ) +
  # Outlines only (no fill) so the boundaries stay visible above the points.
  geom_sf(data = border, fill = NA) +
  theme_minimal() +
  # x is mapped to LON and y to LAT, so the axis titles must follow suit
  # (they were previously swapped).
  labs(
    title = "Map of Crime Locations in Los Angeles",
    x = "Longitude",
    y = "Latitude"
  )

# Victim ages of 0, -1, -2 and -4 are treated as missing.
df <- df %>%
  mutate(Vict.Age = replace(Vict.Age, Vict.Age %in% c(0, -1, -2, -4), NA))
           
# Number of records per victim age (missing ages form their own group).
victim.age <- df %>%
  group_by(Vict.Age) %>%
  summarize(count = n(), .groups = "drop")

# Histogram of victim age.
fig <- plot_ly(x = ~df$Vict.Age, type = "histogram") %>%
  layout(title = "Victim Age", xaxis = list(title = "Victim Age"))

fig
## Warning: Ignoring 26011 observations
# Collapse the numeric weapon code into a coarse weapon class.
# Conditions are evaluated top to bottom and the first match wins, so the
# individual codes that deviate from their hundred-range are listed first.
# Codes outside 100-599 (or missing) yield NA.
# Fixes the misspelled label "Treat" -> "Threat".
# NOTE(review): 501/511 are presumably the bomb-threat / verbal-threat codes
# and 515 physical presence - confirm against the weapon code table.
df <- df %>%
  mutate(weapon.generalized = case_when(
    Weapon.Used.Cd %in% c(501, 511) ~ "Threat",
    Weapon.Used.Cd == 515 ~ "Physical",
    Weapon.Used.Cd >= 100 & Weapon.Used.Cd < 200 ~ "Firearm",
    Weapon.Used.Cd >= 200 & Weapon.Used.Cd < 300 ~ "Sharp Object",
    Weapon.Used.Cd >= 300 & Weapon.Used.Cd < 400 ~ "Blunt Object",
    Weapon.Used.Cd >= 400 & Weapon.Used.Cd < 500 ~ "Physical",
    Weapon.Used.Cd >= 500 & Weapon.Used.Cd < 600 ~ "Other/Unknown Weapon",
    TRUE ~ NA_character_
  ))
# Lookup table from crime code to crime category (semicolon-delimited file);
# only the two columns needed for the join are kept.
crm <- read.csv("categories.csv", sep = ";")
crm <- crm[, c("Crm.Cd", "Crm.Category")]

# Attach the category to every record; records without a match get NA.
df <- left_join(df, crm, by = "Crm.Cd")

# Number of records in each crime category.
crime <- df %>%
  group_by(Crm.Category) %>%
  summarize(count = n(), .groups = "drop")

# Fixed palette handed to the pie slices.
colors <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
  "#ff0000", "#00ff00", "#000", "#f3f3fd"
)

# Pie chart of the category shares.
fig <- crime %>%
  plot_ly(
    labels = ~Crm.Category,
    values = ~count,
    type = "pie",
    marker = list(colors = colors)
  ) %>%
  layout(title = "Distribution of Crime Categories")
fig
# Blank victim-sex entries are imputed with the most frequent recorded value.
# The mode is taken over non-blank, non-missing entries only: computing it
# over the whole column would pick "" whenever blank is the commonest value,
# turning the imputation into a no-op.
df$Vict.Sex <- ifelse(
  df$Vict.Sex == "",
  get_mode(df$Vict.Sex[!is.na(df$Vict.Sex) & df$Vict.Sex != ""]),
  df$Vict.Sex
)

# Expand the single-letter codes; "H" and "X" are pooled into "Others".
# Values not listed here are left unchanged by recode().
df <- df %>%
  mutate(Vict.Sex = recode(
    Vict.Sex,
    "M" = "Male",
    "F" = "Female",
    "H" = "Others",
    "X" = "Others"
  ))
# Number of records per victim sex.
victim.sex <- df %>%
  group_by(Vict.Sex) %>%
  summarize(count = n(), .groups = "drop")

# Pie chart of the shares; grid, zero line and tick labels are switched off
# on both axes.
fig <- victim.sex %>%
  plot_ly(labels = ~Vict.Sex, values = ~count, type = "pie") %>%
  layout(
    title = "Victim Sex",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

fig
# Total number of records per weapon class; denominator for the shares below.
weapon <- df %>%
  group_by(weapon.generalized) %>%
  summarise(total = n(), .groups = "drop")

# Per weapon class, the percentage of its records that fall on each
# victim sex.
weapon_used <- df %>%
  group_by(weapon.generalized, Vict.Sex) %>%
  summarise(count = n(), .groups = "drop") %>%
  left_join(weapon, by = "weapon.generalized") %>%
  mutate(perc = count / total * 100)

# Grouped bars: one cluster per weapon class, one bar per victim sex.
fig <- weapon_used %>%
  plot_ly(
    x = ~weapon.generalized,
    y = ~perc,
    color = ~Vict.Sex,
    type = "bar"
  ) %>%
  layout(
    title = "Relation Between Used Weapon and Victim Gender",
    xaxis = list(title = "Weapon Used"),
    yaxis = list(title = "Percentage"),
    barmode = "group"
  )

fig
# Number of records per hour of day; denominator for the shares below.
count.by.time <- df %>%
  group_by(Time.h) %>%
  summarise(total = n(), .groups = "drop")

# For each hour, the percentage of that hour's records attributed to each
# weapon class. The "Other/Unknown Weapon" class is dropped after counting,
# so it still contributes to the hourly totals.
time.occurred <- df %>%
  group_by(weapon.generalized, Time.h) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(weapon.generalized != "Other/Unknown Weapon") %>%
  left_join(count.by.time, by = "Time.h") %>%
  mutate(perc = count / total * 100)

# One line per weapon class across the hours of the day.
fig <- time.occurred %>%
  plot_ly(
    x = ~Time.h,
    y = ~perc,
    color = ~weapon.generalized,
    type = "scatter",
    mode = "lines"
  ) %>%
  layout(
    title = "Percentage of Used Weapon over time",
    xaxis = list(title = "Time in Hours"),
    yaxis = list(title = "Percentage")
  )
fig
# Palette passed to plotly for the per-sex lines.
color.def <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22"
)

# Total number of records per victim sex; denominator for the shares below.
vs <- df %>%
  group_by(Vict.Sex) %>%
  summarize(total = n(), .groups = "drop")

# For each victim sex, the percentage of its records that fall in each hour.
dist <- df %>%
  group_by(Time.h, Vict.Sex) %>%
  summarize(count = n(), .groups = "drop") %>%
  left_join(vs, by = "Vict.Sex") %>%
  mutate(perc = count / total * 100)

# One line per victim sex. The scatter mode must be "lines"; the former
# "line" is not a valid mode flag. The former `barmode` layout setting was
# removed because it only applies to bar traces.
fig <- plot_ly(
  dist,
  x = ~Time.h,
  y = ~perc,
  color = ~Vict.Sex,
  colors = color.def,
  type = "scatter",
  mode = "lines"
) %>%
  layout(
    title = "Distribution of time by Victim Sex",
    xaxis = list(title = "Time in hour"),
    yaxis = list(title = "Percentage")
  )

fig
# Natural log of the record count for every weapon class / crime category
# pair, with the "Other/Unknown Weapon" class left out.
desc <- df %>%
  group_by(weapon.generalized, Crm.Category) %>%
  summarise(count = log(n()), .groups = "drop") %>%
  filter(weapon.generalized != "Other/Unknown Weapon")

# Heatmap: weapon class on x, crime category on y, log-count as the colour.
plt <- desc %>%
  plot_ly(
    x = ~weapon.generalized,
    y = ~Crm.Category,
    z = ~count,
    type = "heatmap"
  ) %>%
  layout(
    title = "Heatmap of Weapon and Criminal Category (Natural Log Transformed)",
    xaxis = list(title = "Weapon Category"),
    yaxis = list(title = "Criminal Category"),
    showlegend = FALSE
  )
plt